# Computations
import numpy as np
import pandas as pd
import pickle
# Tools
import os
import datetime
import itertools
import pickle  # NOTE(review): duplicate of the pickle import above; harmless but removable.
# Sklearn
from sklearn.impute import SimpleImputer
# Visualisation libraries
## progressbar
import progressbar
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
import warnings
# Silence all library warnings for a cleaner notebook output.
warnings.filterwarnings("ignore")

In this article, we use the Ames housing dataset from the Kaggle competition "House Prices: Advanced Regression Techniques".
Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.
Here's a brief version of what you'll find in the data description file.
| Feature | Description | Feature | Description |
|---|---|---|---|
| SalePrice | the property's sale price in dollars. | HeatingQC | Heating quality and condition |
| MSSubClass | The building class | CentralAir | Central air conditioning |
| MSZoning | The general zoning classification | Electrical | Electrical system |
| LotFrontage | Linear feet of street connected to property | 1stFlrSF | First Floor square feet |
| LotArea | Lot size in square feet | 2ndFlrSF | Second floor square feet |
| Street | Type of road access | LowQualFinSF | Low quality finished square feet (all floors) |
| Alley | Type of alley access | GrLivArea | Above grade (ground) living area square feet |
| LotShape | General shape of property | BsmtFullBath | Basement full bathrooms |
| LandContour | Flatness of the property | BsmtHalfBath | Basement half bathrooms |
| Utilities | Type of utilities available | FullBath | Full bathrooms above grade |
| LotConfig | Lot configuration | HalfBath | Half baths above grade |
| LandSlope | Slope of property | Bedroom | Number of bedrooms above basement level |
| Neighborhood | Physical locations within Ames city limits | Kitchen | Number of kitchens |
| Condition1 | Proximity to main road or railroad | KitchenQual | Kitchen quality |
| Condition2 | Proximity to main road or railroad (if a second is present) | TotRmsAbvGrd | Total rooms above grade (does not include bathrooms) |
| BldgType | Type of dwelling | Functional | Home functionality rating |
| HouseStyle | Style of dwelling | Fireplaces | Number of fireplaces |
| OverallQual | Overall material and finish quality | FireplaceQu | Fireplace quality |
| OverallCond | Overall condition rating | GarageType | Garage location |
| YearBuilt | Original construction date | GarageYrBlt | Year garage was built |
| YearRemodAdd | Remodel date | GarageFinish | Interior finish of the garage |
| RoofStyle | Type of roof | GarageCars | Size of garage in car capacity |
| RoofMatl | Roof material | GarageArea | Size of garage in square feet |
| Exterior1st | Exterior covering on house | GarageQual | Garage quality |
| Exterior2nd | Exterior covering on house (if more than one material) | GarageCond | Garage condition |
| MasVnrType | Masonry veneer type | PavedDrive | Paved driveway |
| MasVnrArea | Masonry veneer area in square feet | WoodDeckSF | Wood deck area in square feet |
| ExterQual | Exterior material quality | OpenPorchSF | Open porch area in square feet |
| ExterCond | Present condition of the material on the exterior | EnclosedPorch | Enclosed porch area in square feet |
| Foundation | Type of foundation | 3SsnPorch | Three season porch area in square feet |
| BsmtQual | Height of the basement | ScreenPorch | Screen porch area in square feet |
| BsmtCond | General condition of the basement | PoolArea | Pool area in square feet |
| BsmtExposure | Walkout or garden level basement walls | PoolQC | Pool quality |
| BsmtFinType1 | Quality of basement finished area | Fence | Fence quality |
| BsmtFinSF1 | Type 1 finished square feet | MiscFeature | Miscellaneous feature not covered in other categories |
| BsmtFinType2 | Quality of second finished area (if present) | MiscVal | Value of miscellaneous feature |
| BsmtFinSF2 | Type 2 finished square feet | MoSold | Month Sold |
| BsmtUnfSF | Unfinished square feet of basement area | YrSold | Year Sold |
| TotalBsmtSF | Total square feet of basement area | SaleType | Type of sale |
| Heating | Type of heating | SaleCondition | Condition of sale |
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print *Text* as a colour-highlighted section header.

    Text -- header label; L -- total rule width; C -- background colour name;
    T -- text colour name. The label is followed by a '=' rule padded to L.
    """
    background = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
                  'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
                  'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    foreground = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
                  'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
                  'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    label = background[C] + foreground[T] + Style.NORMAL + Text + Style.RESET_ALL
    rule = foreground[C] + Style.NORMAL + (L - len(Text) - 1) * '=' + Style.RESET_ALL
    print(label + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of L '=' characters in colour C."""
    palette = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
               'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
               'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    print(palette[C] + Style.NORMAL + '=' * L + Style.RESET_ALL)
def Search_List(Key, List):
    """Return the elements of List whose text contains the substring Key."""
    matches = []
    for item in List:
        if Key in item:
            matches.append(item)
    return matches
PATH = 'house-prices-advanced-regression-techniques'
Files = ['train.csv','test.csv']
#
# Load each CSV into a module-level DataFrame named after the file stem
# ('train', 'test'), parse any date-like columns, and collect per-file
# shape statistics into Files_Info.
Files_Info = pd.DataFrame()
for i in range(len(Files)):
    # Read files
    Header(Files[i])
    filename = Files[i].split('.')[0]
    globals() [filename] = pd.read_csv(os.path.join(PATH, Files[i]))
    Temp = globals() [filename].shape
    # DataFrame.append was removed in pandas 2.0 -- pd.concat is the replacement.
    Files_Info = pd.concat([Files_Info,
                            pd.DataFrame({'File': [Files[i]],
                                          'Number of Instances': [Temp[0]],
                                          'Number of Attributes': [Temp[1]]})],
                           ignore_index = True)
    # Datetime: convert every column whose name contains 'date'.
    Cols = globals() [filename].columns
    DateTime = Search_List('date', Cols)
    if len(DateTime) > 0:
        try:
            # 'date_block_num' is an integer index, not a date -- skip it if present.
            DateTime.remove('date_block_num')
        except ValueError:  # narrowed from a bare except: only list.remove can fail here
            pass
        for c in DateTime:
            globals() [filename][c] = pd.to_datetime(globals() [filename][c])
        del c
    # Display
    display(globals() [filename].head(5))
    Line()
# Styler.hide_index was removed in pandas 2.0 -- hide(axis='index') replaces it.
display(Files_Info.style.hide(axis='index'))
Line()
del Files, i, Temp, filename, Cols, DateTime
train.csv ==========================================================================================
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
test.csv ===========================================================================================
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1461 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | ... | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal |
| 1 | 1462 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal |
| 2 | 1463 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal |
| 3 | 1464 | 60 | RL | 78.0 | 9978 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2010 | WD | Normal |
| 4 | 1465 | 120 | RL | 43.0 | 5005 | Pave | NaN | IR1 | HLS | AllPub | ... | 144 | 0 | NaN | NaN | NaN | 0 | 1 | 2010 | WD | Normal |
5 rows × 80 columns
====================================================================================================
| File | Number of Instances | Number of Attributes |
|---|---|---|
| train.csv | 1460 | 81 |
| test.csv | 1459 | 80 |
====================================================================================================
# Tag each split so rows stay identifiable after pooling, then stack the two
# sets into one frame and order the columns alphabetically.
train['Set'] = 'Train'
test['Set'] = 'Test'
Data = pd.concat([train,test])
Data = Data.reindex(sorted(Data.columns), axis=1)
del train, test
def Data_Plot(Inp, Title, W = None):
    """Bar-plot, for every column of Inp, the percentage of non-missing values
    (coloured by dtype) and return the underlying summary table.

    Inp   -- DataFrame to profile.
    Title -- figure title.
    W     -- optional fixed figure width in pixels (None = auto).

    Returns a DataFrame with columns Features, Data Type, Number of NaN
    Values, Size and Percentage (of values present).
    """
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    Temp = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    data_info = data_info.join(Temp, how='outer')
    data_info ['Size'] = Inp.shape[0]
    # 100 means the column is complete; lower values flag missing data.
    data_info['Percentage'] = 100 - np.round(100*(data_info['Number of NaN Values']/Inp.shape[0]),2)
    data_info = data_info.reset_index(drop = False).rename(columns = {'index':'Features'})
    #
    fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type',
                 text = 'Percentage',
                 color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data = data_info.columns)
    fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
                                                         bordercolor="DarkGray", borderwidth=1))
    # Idiomatic identity test (the original used 'not W == None').
    if W is not None:
        fig.update_layout(width = W)
    fig.update_traces(texttemplate= 10*' ' + '%%{text}', textposition='inside')
    fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
    # '</b>' closes the bold tag -- the original repeated '<b>', leaving it unclosed.
    fig.update_layout(title={'text': '<b>' + Title + '</b>', 'x':0.5,
                             'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
    return data_info
def dtypes_group(Inp, Dict = False):
    """Group the columns of Inp by dtype.

    Returns a DataFrame with one row per dtype ('Data Type', 'Features',
    'Count'), or -- when Dict is True -- a {dtype-string: [feature names]}
    mapping.
    """
    by_type = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    kinds = by_type['Data Type'].unique()
    Out = pd.DataFrame(index = kinds, columns = ['Features','Count'])
    for kind in kinds:
        members = by_type.loc[by_type['Data Type'] == kind].index.tolist()
        # The extra list wrapper stores the whole member list in a single cell.
        Out.loc[Out.index == kind, 'Features'] = [members]
        Out.loc[Out.index == kind, 'Count'] = len(members)
    Out.index.name = 'Data Type'
    Out = Out.reset_index(drop = False)
    Out['Data Type'] = Out['Data Type'].astype(str)
    if Dict:
        return dict(zip(Out['Data Type'], Out['Features']))
    return Out
# Visualise per-feature completeness and build a {dtype: [features]} map.
data_info = Data_Plot(Data, Title = 'House Prices: Advanced Regression Techniques', W = None)
dType = dtypes_group(Data, Dict = True)
def List_Print(Text, List, C = 'Blue', T = 'White'):
    """Print Text as a coloured label followed by the comma-joined items of List."""
    back_codes = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
                  'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
                  'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    fore_codes = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
                  'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
                  'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    label = back_codes[C] + fore_codes[T] + Style.NORMAL + '%s:' % Text + Style.RESET_ALL
    print(label + ' %s' % ', '.join(List))
# Report every feature that still contains missing values.
List_Print('Features with NaN values', data_info.loc[data_info['Number of NaN Values']>0, 'Features'].values)
Features with NaN values: Alley, BsmtCond, BsmtExposure, BsmtFinSF1, BsmtFinSF2, BsmtFinType1, BsmtFinType2, BsmtFullBath, BsmtHalfBath, BsmtQual, BsmtUnfSF, Electrical, Exterior1st, Exterior2nd, Fence, FireplaceQu, Functional, GarageArea, GarageCars, GarageCond, GarageFinish, GarageQual, GarageType, GarageYrBlt, KitchenQual, LotFrontage, MSZoning, MasVnrArea, MasVnrType, MiscFeature, PoolQC, SalePrice, SaleType, TotalBsmtSF, Utilities
# Feature name -> {raw code: human-readable label}; filled in feature by
# feature in the cells below.
Data_Dict = {}
def Col_Details(Feat, Data = Data, Data_Dict = Data_Dict):
    """Return column Feat of Data with its raw codes replaced by the readable
    labels registered in Data_Dict[Feat]."""
    return Data[Feat].replace(Data_Dict[Feat])
# Alley: NaN encodes the absence of an alley -- fill with the 'NA' code
# before translating codes to readable labels.
Data['Alley'] = Data['Alley'].fillna('NA')
Data_Dict['Alley'] = {'Grvl':'Gravel', 'Pave':'Paved', 'NA': 'No Alley Access'}
Group = Col_Details('Alley', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| No Alley Access | Gravel | Paved | |
|---|---|---|---|
| Count | 2721 | 120 | 78 |
# BldgType: the raw data uses '2fmCon', 'Duplex' and 'Twnhs' rather than the
# '2FmCon'/'Duplx'/'TwnhsI' codes listed in the data description (the original
# mapping left 'Twnhs' and '2fmCon' untranslated in the output) -- map both
# spellings so every category is covered.
Data_Dict['BldgType'] = {'1Fam':'Single-family Detached',
                         '2FmCon':'Two-family Conversion; originally built as one-family dwelling',
                         '2fmCon':'Two-family Conversion; originally built as one-family dwelling',
                         'Duplx':'Duplex','Duplex':'Duplex',
                         'TwnhsE':'Townhouse End Unit',
                         'TwnhsI':'Townhouse Inside Unit','Twnhs':'Townhouse Inside Unit'}
Group = Col_Details('BldgType', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Single-family Detached | Townhouse End Unit | Duplex | Twnhs | 2fmCon | |
|---|---|---|---|---|---|
| Count | 2425 | 227 | 109 | 96 | 62 |
# BsmtCond: NaN means the house has no basement.
Data['BsmtCond'] = Data['BsmtCond'].fillna('NA')
Data_Dict['BsmtCond'] = {'Ex':'Excellent','Gd':'Good','TA':'Typical - Slight Dampness Allowed',
                         'Fa':'Fair - Dampness or Some Cracking or Settling',
                         'Po':'Poor - Severe Cracking, Settling, or Wetness','NA':'No Basement'}
Group = Col_Details('BsmtCond', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| Typical - Slight Dampness Allowed | 2606 |
| Good | 122 |
| Fair - Dampness or Some Cracking or Settling | 104 |
| No Basement | 82 |
| Poor - Severe Cracking, Settling, or Wetness | 5 |
# BsmtExposure: NaN means the house has no basement.
Data['BsmtExposure'] = Data['BsmtExposure'].fillna('NA')
# 'Minimum' fixes the 'Mimimum' typo carried over from the raw data description.
Data_Dict['BsmtExposure'] = {'Gd':'Good Exposure',
                             'Av':'Average Exposure (split levels or foyers typically score average or above)',
                             'Mn':'Minimum Exposure','No':'No Exposure','NA':'No Basement'}
Group = Col_Details('BsmtExposure', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| No Exposure | 1904 |
| Average Exposure (split levels or foyers typically score average or above) | 418 |
| Good Exposure | 276 |
| Mimimum Exposure | 239 |
| No Basement | 82 |
# Impute the few missing basement square-footage figures with the modal value,
# then translate the BsmtFinType1 codes ('Unfinished' fixes the raw data's
# 'Unfinshed' typo).
for col in ('BsmtFinSF1', 'BsmtFinSF2'):
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    Data[col] = imp.fit_transform(Data[col].values.reshape(-1, 1))
    del imp
del col
Data['BsmtFinType1'] = Data['BsmtFinType1'].fillna('NA')  # NaN -> no basement
Data_Dict['BsmtFinType1'] = {'GLQ':'Good Living Quarters','ALQ':'Average Living Quarters',
                             'BLQ':'Below Average Living Quarters','Rec':'Average Rec Room',
                             'LwQ':'Low Quality','Unf':'Unfinished','NA':'No Basement'}
Group = Col_Details('BsmtFinType1', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| Unfinshed | 851 |
| Good Living Quarters | 849 |
| Average Living Quarters | 429 |
| Average Rec Room | 288 |
| Below Average Living Quarters | 269 |
| Low Quality | 154 |
| No Basement | 79 |
# BsmtFinType2: NaN means no basement; 'Unfinished' fixes the raw data's
# 'Unfinshed' typo.
Data['BsmtFinType2'] = Data['BsmtFinType2'].fillna('NA')
Data_Dict['BsmtFinType2'] = {'GLQ':'Good Living Quarters','ALQ':'Average Living Quarters',
                             'BLQ':'Below Average Living Quarters','Rec':'Average Rec Room',
                             'LwQ':'Low Quality','Unf':'Unfinished','NA':'No Basement'}
Group = Col_Details('BsmtFinType2', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| Unfinshed | 2493 |
| Average Rec Room | 105 |
| Low Quality | 87 |
| No Basement | 80 |
| Below Average Living Quarters | 68 |
| Average Living Quarters | 52 |
| Good Living Quarters | 34 |
# The rows with a missing BsmtFullBath have no basement at all (see the
# basement columns displayed below), so 0 is the faithful fill value.
display(Data.loc[Data['BsmtFullBath'].isna() == True, Search_List('Bsmt', Data.columns)])
Data['BsmtFullBath'] = Data['BsmtFullBath'].fillna(0)
| BsmtCond | BsmtExposure | BsmtFinSF1 | BsmtFinSF2 | BsmtFinType1 | BsmtFinType2 | BsmtFullBath | BsmtHalfBath | BsmtQual | BsmtUnfSF | TotalBsmtSF | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 660 | NA | NA | 0.0 | 0.0 | NA | NA | NaN | NaN | NaN | NaN | NaN |
| 728 | NA | NA | 0.0 | 0.0 | NA | NA | NaN | NaN | NaN | 0.0 | 0.0 |
# Same rows as BsmtFullBath: no basement, so 0 half-baths is the faithful fill.
display(Data.loc[Data['BsmtHalfBath'].isna() == True, Search_List('Bsmt', Data.columns)])
Data['BsmtHalfBath'] = Data['BsmtHalfBath'].fillna(0)
| BsmtCond | BsmtExposure | BsmtFinSF1 | BsmtFinSF2 | BsmtFinType1 | BsmtFinType2 | BsmtFullBath | BsmtHalfBath | BsmtQual | BsmtUnfSF | TotalBsmtSF | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 660 | NA | NA | 0.0 | 0.0 | NA | NA | 0.0 | NaN | NaN | NaN | NaN |
| 728 | NA | NA | 0.0 | 0.0 | NA | NA | 0.0 | NaN | NaN | 0.0 | 0.0 |
# BsmtQual rates basement ceiling height; NaN means no basement.
Data['BsmtQual'] = Data['BsmtQual'].fillna('NA')
Data_Dict['BsmtQual'] = {'Ex':'Excellent (100+ inches)','Gd':'Good (90-99 inches)','TA':'Typical (80-89 inches)',
                         'Fa':'Fair (70-79 inches)','Po':'Poor (<70 inches)','NA':'No Basement'}
Group = Col_Details('BsmtQual', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Typical (80-89 inches) | Good (90-99 inches) | Excellent (100+ inches) | Fair (70-79 inches) | No Basement | |
|---|---|---|---|---|---|
| Count | 1283 | 1209 | 258 | 88 | 81 |
# Impute missing BsmtUnfSF with the most frequent value, then summarise
# the CentralAir flag.
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
Data['BsmtUnfSF'] = mode_imputer.fit_transform(Data['BsmtUnfSF'].values.reshape(-1, 1))
del mode_imputer
Data_Dict['CentralAir'] = {'N':'No','Y':'Yes'}
Group = Col_Details('CentralAir', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Yes | No | |
|---|---|---|
| Count | 2723 | 196 |
# Condition1 codes -> labels ('positive' fixes the raw description's 'postive' typo).
Data_Dict['Condition1'] = {'Artery':'Adjacent to arterial street','Feedr':'Adjacent to feeder street','Norm':'Normal',
                           'RRNn':"Within 200' of North-South Railroad",'RRAn':'Adjacent to North-South Railroad',
                           'PosN':'Near positive off-site feature--park, greenbelt, etc.',
                           'PosA':'Adjacent to positive off-site feature','RRNe':"Within 200' of East-West Railroad",
                           'RRAe':'Adjacent to East-West Railroad'}
Group = Col_Details('Condition1', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| Normal | 2511 |
| Adjacent to feeder street | 164 |
| Adjacent to arterial street | 92 |
| Adjacent to North-South Railroad | 50 |
| Near positive off-site feature--park, greenbelt, etc. | 39 |
| Adjacent to East-West Railroad | 28 |
| Adjacent to postive off-site feature | 20 |
| Within 200' of North-South Railroad | 9 |
| Within 200' of East-West Railroad | 6 |
# Condition2 (second proximity, if any) -- same mapping as Condition1;
# 'positive' fixes the raw description's 'postive' typo.
Data_Dict['Condition2'] = {'Artery':'Adjacent to arterial street','Feedr':'Adjacent to feeder street','Norm':'Normal',
                           'RRNn':"Within 200' of North-South Railroad",'RRAn':'Adjacent to North-South Railroad',
                           'PosN':'Near positive off-site feature--park, greenbelt, etc.',
                           'PosA':'Adjacent to positive off-site feature','RRNe':"Within 200' of East-West Railroad",
                           'RRAe':'Adjacent to East-West Railroad'}
Group = Col_Details('Condition2', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| Normal | 2889 |
| Adjacent to feeder street | 13 |
| Adjacent to arterial street | 5 |
| Adjacent to postive off-site feature | 4 |
| Near positive off-site feature--park, greenbelt, etc. | 4 |
| Within 200' of North-South Railroad | 2 |
| Adjacent to North-South Railroad | 1 |
| Adjacent to East-West Railroad | 1 |
# Fill the missing Electrical entries with the modal system, then summarise
# the readable categories.
freq_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
Data['Electrical'] = freq_imputer.fit_transform(Data['Electrical'].values.reshape(-1, 1))
del freq_imputer
Data_Dict['Electrical'] = {'SBrkr':'Standard Circuit Breakers & Romex',
                           'FuseA':'Fuse Box over 60 AMP and all Romex wiring (Average)',
                           'FuseF':'60 AMP Fuse Box and mostly Romex wiring (Fair)',
                           'FuseP':'60 AMP Fuse Box and mostly knob & tube wiring (poor)',
                           'Mix':'Mixed'}
Group = Col_Details('Electrical', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| Standard Circuit Breakers & Romex | 2672 |
| Fuse Box over 60 AMP and all Romex wiring (Average) | 188 |
| 60 AMP Fuse Box and mostly Romex wiring (Fair) | 50 |
| 60 AMP Fuse Box and mostly knob & tube wiring (poor) | 8 |
| Mixed | 1 |
# ExterCond: present condition of the exterior material (no NaNs to fill).
Data_Dict['ExterCond'] = {'Ex':'Excellent','Gd':'Good','TA':'Average/Typical','Fa':'Fair','Po':'Poor'}
Group = Col_Details('ExterCond', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Average/Typical | Good | Fair | Excellent | Poor | |
|---|---|---|---|---|---|
| Count | 2538 | 299 | 67 | 12 | 3 |
# ExterQual: exterior material quality (no NaNs to fill).
Data_Dict['ExterQual'] = {'Ex':'Excellent','Gd':'Good','TA':'Average/Typical','Fa':'Fair','Po':'Poor'}
Group = Col_Details('ExterQual', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Average/Typical | Good | Excellent | Fair | |
|---|---|---|---|---|
| Count | 1798 | 979 | 107 | 35 |
# Exterior1st: impute missing entries with the most frequent covering, then
# translate the codes.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
Data['Exterior1st'] = imp.fit_transform(Data['Exterior1st'].values.reshape(-1, 1))
del imp
Data_Dict['Exterior1st'] = {'AsbShng':'Asbestos Shingles','AsphShn':'Asphalt Shingles','BrkComm':'Brick Common',
                            'BrkFace':'Brick Face','CBlock':'Cinder Block','CemntBd':'Cement Board','HdBoard':'Hard Board',
                            'ImStucc':'Imitation Stucco','MetalSd':'Metal Siding','Other':'Other','Plywood':'Plywood',
                            'PreCast':'PreCast','Stone':'Stone','Stucco':'Stucco','VinylSd':'Vinyl Siding',
                            'Wd Sdng':'Wood Siding','WdShing':'Wood Shingles'}
Group = Col_Details('Exterior1st', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| Vinyl Siding | 1026 |
| Metal Siding | 450 |
| Hard Board | 442 |
| Wood Siding | 411 |
| Plywood | 221 |
| Cement Board | 126 |
| Brick Face | 87 |
| Wood Shingles | 56 |
| Asbestos Shingles | 44 |
| Stucco | 43 |
| Brick Common | 6 |
| Asphalt Shingles | 2 |
| Stone | 2 |
| Cinder Block | 2 |
| Imitation Stucco | 1 |
# Exterior2nd: impute missing entries with the most frequent covering.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
Data['Exterior2nd'] = imp.fit_transform(Data['Exterior2nd'].values.reshape(-1, 1))
del imp
# The raw Exterior2nd column spells some codes differently from Exterior1st
# ('CmentBd', 'Wd Shng', 'Brk Cmn'); the original mapping left those three
# untranslated in the output, so both spellings are mapped here.
Data_Dict['Exterior2nd'] = {'AsbShng':'Asbestos Shingles','AsphShn':'Asphalt Shingles',
                            'BrkComm':'Brick Common','Brk Cmn':'Brick Common',
                            'BrkFace':'Brick Face','CBlock':'Cinder Block',
                            'CemntBd':'Cement Board','CmentBd':'Cement Board',
                            'HdBoard':'Hard Board','ImStucc':'Imitation Stucco',
                            'MetalSd':'Metal Siding','Other':'Other','Plywood':'Plywood',
                            'PreCast':'PreCast','Stone':'Stone','Stucco':'Stucco',
                            'VinylSd':'Vinyl Siding','Wd Sdng':'Wood Siding',
                            'WdShing':'Wood Shingles','Wd Shng':'Wood Shingles'}
Group = Col_Details('Exterior2nd', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| Vinyl Siding | 1015 |
| Metal Siding | 447 |
| Hard Board | 406 |
| Wood Siding | 391 |
| Plywood | 270 |
| CmentBd | 126 |
| Wd Shng | 81 |
| Stucco | 47 |
| Brick Face | 47 |
| Asbestos Shingles | 38 |
| Brk Cmn | 22 |
| Imitation Stucco | 15 |
| Stone | 6 |
| Asphalt Shingles | 4 |
| Cinder Block | 3 |
| Other | 1 |
# Fence: NaN encodes "no fence".
Data['Fence'] = Data['Fence'].fillna('NA')
Data_Dict['Fence'] = {'GdPrv':'Good Privacy','MnPrv':'Minimum Privacy','GdWo':'Good Wood','MnWw':'Minimum Wood/Wire',
                      'NA':'No Fence'}
Group = Col_Details('Fence', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| No Fence | Minimum Privacy | Good Privacy | Good Wood | Minimum Wood/Wire | |
|---|---|---|---|---|---|
| Count | 2348 | 329 | 118 | 112 | 12 |
# FireplaceQu: NaN means the house has no fireplace.
Data['FireplaceQu'] = Data['FireplaceQu'].fillna('NA')
Data_Dict['FireplaceQu'] = {'Ex':'Excellent - Exceptional Masonry Fireplace',
                            'Gd':'Good - Masonry Fireplace in main level',
                            'TA':'Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement',
                            'Fa':'Fair - Prefabricated Fireplace in basement',
                            'Po':'Poor - Ben Franklin Stove',
                            'NA':'No Fireplace'}
Group = Col_Details('FireplaceQu', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| No Fireplace | 1420 |
| Good - Masonry Fireplace in main level | 744 |
| Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement | 592 |
| Fair - Prefabricated Fireplace in basement | 74 |
| Poor - Ben Franklin Stove | 46 |
| Excellent - Exceptional Masonry Fireplace | 43 |
# Foundation codes -> labels ('Poured Concrete' fixes the raw description's
# 'Poured Contrete' typo).
Data_Dict['Foundation'] = {'BrkTil':'Brick & Tile','CBlock':'Cinder Block','PConc':'Poured Concrete','Slab':'Slab',
                           'Stone':'Stone','Wood':'Wood'}
Group = Col_Details('Foundation', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Poured Contrete | Cinder Block | Brick & Tile | Slab | Stone | Wood | |
|---|---|---|---|---|---|---|
| Count | 1308 | 1235 | 311 | 49 | 11 | 5 |
# Functional: impute the missing ratings with the most common value, then
# translate the codes.
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
Data['Functional'] = mode_imputer.fit_transform(Data['Functional'].values.reshape(-1, 1))
del mode_imputer
Data_Dict['Functional'] = {'Typ':'Typical Functionality','Min1':'Minor Deductions 1','Min2':'Minor Deductions 2',
                           'Mod':'Moderate Deductions','Maj1':'Major Deductions 1','Maj2':'Major Deductions 2',
                           'Sev':'Severely Damaged','Sal':'Salvage only'}
Group = Col_Details('Functional', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Typical Functionality | Minor Deductions 2 | Minor Deductions 1 | Moderate Deductions | Major Deductions 1 | Major Deductions 2 | Severely Damaged | |
|---|---|---|---|---|---|---|---|
| Count | 2719 | 70 | 65 | 35 | 19 | 9 | 2 |
# Garage fields: impute area with the modal value, treat missing car
# capacity as 0, and missing condition as "no garage".
area_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
Data['GarageArea'] = area_imputer.fit_transform(Data['GarageArea'].values.reshape(-1, 1))
del area_imputer
Data['GarageCars'] = Data['GarageCars'].fillna(0).astype(int)
Data['GarageCond'] = Data['GarageCond'].fillna('NA')
Data_Dict['GarageCond'] = {'Ex':'Excellent','Gd':'Good','TA':'Typical/Average','Fa':'Fair','Po':'Poor','NA':'No Garage'}
Group = Col_Details('GarageCond', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Typical/Average | No Garage | Fair | Good | Poor | Excellent | |
|---|---|---|---|---|---|---|
| Count | 2654 | 159 | 74 | 15 | 14 | 3 |
# GarageFinish: NaN means the house has no garage.
Data['GarageFinish'] = Data['GarageFinish'].fillna('NA')
Data_Dict['GarageFinish'] = {'Fin':'Finished','RFn':'Rough Finished','Unf':'Unfinished','NA':'No Garage'}
Group = Col_Details('GarageFinish', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Unfinished | Rough Finished | Finished | No Garage | |
|---|---|---|---|---|
| Count | 1230 | 811 | 719 | 159 |
# GarageQual: NaN means the house has no garage.
Data['GarageQual'] = Data['GarageQual'].fillna('NA')
Data_Dict['GarageQual'] = {'Ex':'Excellent','Gd':'Good','TA':'Typical/Average','Fa':'Fair','Po':'Poor','NA':'No Garage'}
Group = Col_Details('GarageQual', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Typical/Average | No Garage | Fair | Good | Poor | Excellent | |
|---|---|---|---|---|---|---|
| Count | 2604 | 159 | 124 | 24 | 5 | 3 |
# GarageType: NaN means the house has no garage.
Data['GarageType'] = Data['GarageType'].fillna('NA')
Data_Dict['GarageType'] = {'2Types':'More than one type of garage','Attchd':'Attached to home','Basment':'Basement Garage',
                           'BuiltIn':'Built-In (Garage part of house - typically has room above garage)',
                           'CarPort':'Car Port','Detchd':'Detached from home','NA':'No Garage'}
Group = Col_Details('GarageType', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Attached to home | Detached from home | Built-In (Garage part of house - typically has room above garage) | No Garage | Basement Garage | More than one type of garage | Car Port | |
|---|---|---|---|---|---|---|---|
| Count | 1723 | 779 | 186 | 157 | 36 | 23 | 15 |
# GarageYrBlt: -1 flags "no garage" while keeping the column integer-typed.
Data['GarageYrBlt'] = Data['GarageYrBlt'].fillna(-1).astype(int)
Data_Dict['Heating'] = {'Floor':'Floor Furnace','GasA':'Gas forced warm air furnace','GasW':'Gas hot water or steam heat',
                        'Grav':'Gravity furnace','OthW':'Hot water or steam heat other than gas','Wall':'Wall furnace'}
Group = Col_Details('Heating', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Gas forced warm air furnace | Gas hot water or steam heat | Gravity furnace | Wall furnace | Hot water or steam heat other than gas | Floor Furnace | |
|---|---|---|---|---|---|---|
| Count | 2874 | 27 | 9 | 6 | 2 | 1 |
# HeatingQC: heating quality and condition (no NaNs to fill).
Data_Dict['HeatingQC'] = {'Ex':'Excellent','Gd':'Good','TA':'Average/Typical','Fa':'Fair','Po':'Poor'}
Group = Col_Details('HeatingQC', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Excellent | Average/Typical | Good | Fair | Poor | |
|---|---|---|---|---|---|
| Count | 1493 | 857 | 474 | 92 | 3 |
# HouseStyle: dwelling style codes (no NaNs to fill).
Data_Dict['HouseStyle'] = {'1Story':'One story','1.5Fin':'One and one-half story: 2nd level finished',
                           '1.5Unf':'One and one-half story: 2nd level unfinished',
                           '2Story':'Two story','2.5Fin':'Two and one-half story: 2nd level finished',
                           '2.5Unf':'Two and one-half story: 2nd level unfinished','SFoyer':'Split Foyer','SLvl':'Split Level'}
Group = Col_Details('HouseStyle', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| One story | 1471 |
| Two story | 872 |
| One and one-half story: 2nd level finished | 314 |
| Split Level | 128 |
| Split Foyer | 83 |
| Two and one-half story: 2nd level unfinished | 24 |
| One and one-half story: 2nd level unfinished | 19 |
| Two and one-half story: 2nd level finished | 8 |
# KitchenQual: impute the missing ratings with the most common value,
# then translate the codes.
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
Data['KitchenQual'] = mode_imputer.fit_transform(Data['KitchenQual'].values.reshape(-1, 1))
del mode_imputer
Data_Dict['KitchenQual'] = {'Ex':'Excellent','Gd':'Good','TA':'Typical/Average','Fa':'Fair','Po':'Poor'}
Group = Col_Details('KitchenQual', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Typical/Average | Good | Excellent | Fair | |
|---|---|---|---|---|
| Count | 1493 | 1151 | 205 | 70 |
# LandContour: flatness of the property (no NaNs to fill).
Data_Dict['LandContour'] = {'Lvl':'Near Flat/Level','Bnk':'Banked - Quick and significant rise from street grade to building',
                            'HLS':'Hillside - Significant slope from side to side','Low':'Depression'}
Group = Col_Details('LandContour', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| Near Flat/Level | 2622 |
| Hillside - Significant slope from side to side | 120 |
| Banked - Quick and significant rise from street grade to building | 117 |
| Depression | 60 |
# LandSlope: slope of the property (no NaNs to fill).
Data_Dict['LandSlope'] = {'Gtl':'Gentle Slope','Mod':'Moderate Slope','Sev':'Severe Slope'}
Group = Col_Details('LandSlope', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Gentle Slope | Moderate Slope | Severe Slope | |
|---|---|---|---|
| Count | 2778 | 125 | 16 |
# LotConfig: lot configuration (no NaNs to fill).
Data_Dict['LotConfig'] = {'Inside':'Inside lot','Corner':'Corner lot','CulDSac':'Cul-de-sac',
                          'FR2':'Frontage on 2 sides of property','FR3':'Frontage on 3 sides of property'}
Group = Col_Details('LotConfig', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| Inside lot | 2133 |
| Corner lot | 511 |
| Cul-de-sac | 176 |
| Frontage on 2 sides of property | 85 |
| Frontage on 3 sides of property | 14 |
# MSZoning: fill missing zones with the modal zoning of houses whose
# LotFrontage lies in the 109-125 ft band.
# NOTE(review): the 109-125 band presumably matches the LotFrontage of the
# rows with missing MSZoning -- confirm against the raw rows.
Data['MSZoning'] = Data['MSZoning'].fillna(Data.loc[(Data['LotFrontage'] >=109) &
                                                    (Data['LotFrontage'] <=125), 'MSZoning'].mode()[0])
Data_Dict['MSZoning'] = {'A':'Agriculture','C':'Commercial','FV':'Floating Village Residential','I':'Industrial',
                         'RH':'Residential High Density','RL':'Residential Low Density','RP':'Residential Low Density Park',
                         'RM':'Residential Medium Density'}
# Normalise the raw 'C (all)' code to plain 'C' so it matches the mapping above.
Data['MSZoning'] = Data['MSZoning'].replace({'C (all)':'C'})
Group = Col_Details('MSZoning')
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| Residential Low Density | 2269 |
| Residential Medium Density | 460 |
| Floating Village Residential | 139 |
| Residential High Density | 26 |
| Commercial | 25 |
# Impute LotFrontage separately within each zoning class, using the class
# mean; then summarise LotShape.
mylist = Data['MSZoning'].unique()
for x in mylist:
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    in_zone = Data['MSZoning'] == x
    Data.loc[in_zone, 'LotFrontage'] = imp.fit_transform(
        Data.loc[in_zone, 'LotFrontage'].values.reshape(-1, 1))
    del imp, in_zone
Data_Dict['LotShape'] = {'Reg':'Regular', 'IR1': 'Slightly irregular', 'IR2': 'Moderately Irregular', 'IR3': 'Irregular'}
Group = Col_Details('LotShape', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Regular | Slightly irregular | Moderately Irregular | Irregular | |
|---|---|---|---|---|
| Count | 1859 | 968 | 76 | 16 |
Data_Dict['MSSubClass'] = {20:'1-Story 1946 & Newer All Styles', 30:'1-Story 1945 & Older',
40:'1-Story W/Finished Attic All Ages',45:'1-1/2 Story - Unfinished All Ages',
50:'1-1/2 Story Finished All Ages',60:'2-Story 1946 & Newer',70:'2-Story 1945 & Older',
75:'2-1/2 Story All Ages',80:'Split Or Multi-Level',85:'Split Foyer',
90:'Duplex - All Styles And Ages',120:'1-Story Pud (Planned Unit Development) - 1946 & Newer',
150:'1-1/2 Story Pud - All Ages',160:'2-Story Pud - 1946 & Newer',
180:'Pud - Multilevel - Incl Split Lev/Foyer',190:'2 Family Conversion - All Styles And Ages'}
def Col_Details(Feat, Data=None, Data_Dict=None):
    """Return column *Feat* of *Data* with its codes replaced by readable labels.

    Parameters
    ----------
    Feat : str
        Column name; must also be a key of ``Data_Dict``.
    Data : pandas.DataFrame, optional
        Frame to read from.  Defaults to the module-level ``Data``.
        Resolved at call time (late binding) instead of at definition
        time, so the function always sees the current globals — the
        original ``Data=Data`` default froze whatever object existed
        when the ``def`` ran.
    Data_Dict : dict, optional
        Mapping ``{column: {code: label}}``; defaults to the
        module-level ``Data_Dict``.

    Returns
    -------
    pandas.Series
        The decoded column; the source frame is not modified, and codes
        absent from the mapping pass through unchanged.
    """
    if Data is None:
        Data = globals()['Data']
    if Data_Dict is None:
        Data_Dict = globals()['Data_Dict']
    return Data[Feat].replace(Data_Dict[Feat])
# Show the distribution of building classes using the readable labels
# (relies on Col_Details' default Data / Data_Dict arguments).
Group = Col_Details('MSSubClass')
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| 1-Story 1946 & Newer All Styles | 1079 |
| 2-Story 1946 & Newer | 575 |
| 1-1/2 Story Finished All Ages | 287 |
| 1-Story Pud (Planned Unit Development) - 1946 & Newer | 182 |
| 1-Story 1945 & Older | 139 |
| 2-Story Pud - 1946 & Newer | 128 |
| 2-Story 1945 & Older | 128 |
| Split Or Multi-Level | 118 |
| Duplex - All Styles And Ages | 109 |
| 2 Family Conversion - All Styles And Ages | 61 |
| Split Foyer | 48 |
| 2-1/2 Story All Ages | 23 |
| 1-1/2 Story - Unfinished All Ages | 18 |
| Pud - Multilevel - Incl Split Lev/Foyer | 17 |
| 1-Story W/Finished Attic All Ages | 6 |
| 1-1/2 Story Pud - All Ages | 1 |
# Masonry veneer: treat a missing veneer area as "no veneer".
Data['MasVnrArea'] = Data['MasVnrArea'].fillna(0)
# Fill the few missing veneer types with the modal category.  Same result
# as the original most-frequent SimpleImputer (ties broken by the smallest
# value in both), without the round-trip through a reshaped 2-D numpy
# array; the dead commented-out fillna(0) line was removed.
Data['MasVnrType'] = Data['MasVnrType'].fillna(Data['MasVnrType'].mode()[0])
# Code -> human-readable label mapping for MasVnrType (veneer material).
Data_Dict['MasVnrType'] = {'BrkCmn':'Brick Common','BrkFace':'Brick Face','CBlock':'Cinder Block','None':'None',
                           'Stone':'Stone'}
# Show the distribution of veneer types using the readable labels.
Group = Col_Details('MasVnrType', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| None | Brick Face | Stone | Brick Common | |
|---|---|---|---|---|
| Count | 1766 | 879 | 249 | 25 |
# For MiscFeature a missing value means the house has none of the listed
# extras; encode it with the explicit 'NA' category so it maps to 'None'.
Data['MiscFeature'] = Data['MiscFeature'].fillna('NA')
# Code -> human-readable label mapping for MiscFeature.
Data_Dict['MiscFeature'] = {'Elev':'Elevator','Gar2':'2nd Garage (if not described in garage section)','Othr':'Other',
                            'Shed':'Shed (over 100 SF)','TenC':'Tennis Court','NA':'None'}
# Show the distribution of misc features using the readable labels.
Group = Col_Details('MiscFeature', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| None | Shed (over 100 SF) | 2nd Garage (if not described in garage section) | Other | Tennis Court | |
|---|---|---|---|---|---|
| Count | 2814 | 95 | 5 | 4 | 1 |
# The raw data spells North Ames as 'NAmes'; rename it to 'Names' so it
# matches the key used in the label dictionary below.
Data['Neighborhood'] = Data['Neighborhood'].replace({'NAmes':'Names'})
# Code -> human-readable label mapping for Neighborhood (location in Ames).
Data_Dict['Neighborhood'] = {'Blmngtn':'Bloomington Heights','Blueste':'Bluestem','BrDale':'Briardale','BrkSide':'Brookside',
                             'ClearCr':'Clear Creek','CollgCr':'College Creek','Crawfor':'Crawford','Edwards':'Edwards',
                             'Gilbert':'Gilbert','IDOTRR':'Iowa DOT and Rail Road','MeadowV':'Meadow Village',
                             'Mitchel':'Mitchell','Names':'North Ames','NoRidge':'Northridge','NPkVill':'Northpark Villa',
                             'NridgHt':'Northridge Heights','NWAmes':'Northwest Ames','OldTown':'Old Town',
                             'SWISU':'South & West of Iowa State University','Sawyer':'Sawyer','SawyerW':'Sawyer West',
                             'Somerst':'Somerset','StoneBr':'Stone Brook','Timber':'Timberland','Veenker':'Veenker'}
# Show the distribution of neighborhoods using the readable labels.
Group = Col_Details('Neighborhood', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| North Ames | 443 |
| College Creek | 267 |
| Old Town | 239 |
| Edwards | 194 |
| Somerset | 182 |
| Northridge Heights | 166 |
| Gilbert | 165 |
| Sawyer | 151 |
| Northwest Ames | 131 |
| Sawyer West | 125 |
| Mitchell | 114 |
| Brookside | 108 |
| Crawford | 103 |
| Iowa DOT and Rail Road | 93 |
| Timberland | 72 |
| Northridge | 71 |
| Stone Brook | 51 |
| South & West of Iowa State University | 48 |
| Clear Creek | 44 |
| Meadow Village | 37 |
| Briardale | 30 |
| Bloomington Heights | 28 |
| Veenker | 24 |
| Northpark Villa | 23 |
| Bluestem | 10 |
# Code -> human-readable label mapping for OverallQual (overall material
# and finish rating; integer keys match the raw 1-10 column values).
Data_Dict['OverallQual'] = {10:'Very Excellent',9:'Excellent',8:'Very Good',7:'Good',6:'Above Average',5:'Average',
                            4:'Below Average',3:'Fair',2:'Poor',1:'Very Poor'}
# Show the distribution of quality ratings using the readable labels.
Group = Col_Details('OverallQual', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Average | Above Average | Good | Very Good | Below Average | Excellent | Fair | Very Excellent | Poor | Very Poor | |
|---|---|---|---|---|---|---|---|---|---|---|
| Count | 825 | 731 | 600 | 342 | 226 | 107 | 40 | 31 | 13 | 4 |
# Code -> human-readable label mapping for PavedDrive (driveway surface).
Data_Dict['PavedDrive'] = {'Y':'Paved','P':'Partial Pavement','N':'Dirt/Gravel'}
# Show the distribution of driveway types using the readable labels.
Group = Col_Details('PavedDrive', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Paved | Dirt/Gravel | Partial Pavement | |
|---|---|---|---|
| Count | 2641 | 216 | 62 |
# For PoolQC a missing value means the house has no pool; encode it with
# the explicit 'NA' category so it maps to 'No Pool' below.
Data['PoolQC'] = Data['PoolQC'].fillna('NA')
# Code -> human-readable label mapping for PoolQC (pool quality).
Data_Dict['PoolQC'] = {'Ex':'Excellent','Gd':'Good','TA':'Average/Typical','Fa':'Fair','NA':'No Pool'}
# Show the distribution of pool qualities using the readable labels.
Group = Col_Details('PoolQC', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| No Pool | Excellent | Good | Fair | |
|---|---|---|---|---|
| Count | 2909 | 4 | 4 | 2 |
# Code -> human-readable label mapping for RoofMatl (roof material).
Data_Dict['RoofMatl'] = {'ClyTile':'Clay or Tile','CompShg':'Standard (Composite) Shingle','Membran':'Membrane','Metal':'Metal',
                         'Roll':'Roll','Tar&Grv':'Gravel & Tar','WdShake':'Wood Shakes','WdShngl':'Wood Shingles'}
# Show the distribution of roof materials using the readable labels.
Group = Col_Details('RoofMatl', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Standard (Composite) Shingle | Gravel & Tar | Wood Shakes | Wood Shingles | Membrane | Roll | Clay or Tile | Metal | |
|---|---|---|---|---|---|---|---|---|
| Count | 2876 | 23 | 9 | 7 | 1 | 1 | 1 | 1 |
# Code -> human-readable label mapping for RoofStyle (roof shape).
Data_Dict['RoofStyle'] = {'Flat':'Flat','Gable':'Gable','Gambrel':'Gabrel (Barn)','Hip':'Hip','Mansard':'Mansard','Shed':'Shed'}
# Show the distribution of roof styles using the readable labels.
Group = Col_Details('RoofStyle', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Gable | Hip | Gabrel (Barn) | Flat | Mansard | Shed | |
|---|---|---|---|---|---|---|
| Count | 2310 | 551 | 22 | 20 | 11 | 5 |
# Code -> human-readable label mapping for SaleCondition (condition of sale).
Data_Dict['SaleCondition'] = {'Normal':'Normal Sale','Abnorml':'Abnormal Sale - trade, foreclosure, short sale',
                              'AdjLand':'Adjoining Land Purchase',
                              'Alloca':'Allocation - two linked properties with separate deeds, typically condo with a garage unit',
                              'Family':'Sale between family members',
                              'Partial':'Home was not completed when last assessed (associated with New Homes)'}
# Show the distribution of sale conditions using the readable labels.
Group = Col_Details('SaleCondition', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Normal Sale | Home was not completed when last assessed (associated with New Homes) | Abnormal Sale - trade, foreclosure, short sale | Sale between family members | Allocation - two linked properties with separate deeds, typically condo with a garage unit | Adjoining Land Purchase | |
|---|---|---|---|---|---|---|
| Count | 2402 | 245 | 190 | 46 | 24 | 12 |
# A few SaleType values are missing; fill them with the modal category.
# Same result as the original most-frequent SimpleImputer (ties broken by
# the smallest value in both), without the round-trip through a reshaped
# 2-D numpy array.
Data['SaleType'] = Data['SaleType'].fillna(Data['SaleType'].mode()[0])
# Code -> human-readable label mapping for SaleType (type of sale/deed).
Data_Dict['SaleType'] = {'WD':'Warranty Deed - Conventional','CWD':'Warranty Deed - Cash','VWD':'Warranty Deed - VA Loan',
                         'New':'Home just constructed and sold','COD':'Court Officer Deed/Estate',
                         'Con':'Contract 15% Down payment regular terms','ConLw':'Contract Low Down payment and low interest',
                         'ConLI':'Contract Low Interest','ConLD':'Contract Low Down','Oth':'Other'}
# Show the distribution of sale types using the readable labels.
Group = Col_Details('SaleType', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Warranty Deed - Conventional | Home just constructed and sold | Court Officer Deed/Estate | Contract Low Down | Warranty Deed - Cash | Contract Low Interest | Contract Low Down payment and low interest | Other | Contract 15% Down payment regular terms | |
|---|---|---|---|---|---|---|---|---|---|
| Count | 2526 | 239 | 87 | 26 | 12 | 9 | 8 | 7 | 5 |
# Code -> human-readable label mapping for Street (type of road access).
Data_Dict['Street'] = {'Grvl':'Gravel', 'Pave':'Paved'}
# Show the distribution of street types using the readable labels.
Group = Col_Details('Street', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count').T)
| Paved | Gravel | |
|---|---|---|
| Count | 2907 | 12 |
# Fill the remaining missing TotalBsmtSF and Utilities values with each
# column's modal value.  Same result as the original most-frequent
# SimpleImputers (ties broken by the smallest value in both), without the
# round-trip through reshaped 2-D numpy arrays.
# NOTE(review): most-frequent is a blunt choice for the numeric
# TotalBsmtSF column — the median may be preferable; confirm intent.
Data['TotalBsmtSF'] = Data['TotalBsmtSF'].fillna(Data['TotalBsmtSF'].mode()[0])
Data['Utilities'] = Data['Utilities'].fillna(Data['Utilities'].mode()[0])
# Code -> human-readable label mapping for Utilities (available utilities).
Data_Dict['Utilities'] = {'AllPub':'All public Utilities (E,G,W,& S)','NoSewr':'Electricity, Gas, and Water (Septic Tank)',
                          'NoSeWa':'Electricity and Gas Only', 'ELO':'Electricity only'}
# Show the distribution of utility levels using the readable labels.
Group = Col_Details('Utilities', Data = Data, Data_Dict = Data_Dict)
display(Group.value_counts().to_frame('Count'))
| Count | |
|---|---|
| All public Utilities (E,G,W,& S) | 2918 |
| Electricity and Gas Only | 1 |
# Plot the cleaned dataset, then persist it for the downstream notebooks.
_ = Data_Plot(Data, Title = 'House Prices: Advanced Regression Techniques', W = None)
# Normalise the identifier column name before exporting.
Data = Data.rename(columns = {'Id':'ID'})
# index=False states the intent explicitly — the original passed
# index=None, which only suppresses the row index because pandas treats
# None as falsy here.
Data.to_csv(PATH + '/House_Prices.csv', index=False, header=True)
# Save the code->label dictionary alongside the CSV so the labels can be
# re-applied later.
with open(PATH + '/House_Prices_Data_Dict.pkl', 'wb') as fp:
    pickle.dump(Data_Dict, fp, protocol=pickle.HIGHEST_PROTOCOL)